package net.ming616.nlp.extraction.service.impl;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import net.ming616.nlp.extraction.service.HTMLExtractor;
import net.ming616.nlp.extraction.service.WenwenExtractor;
import net.ming616.nlp.qa.model.Question;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * {@link WenwenExtractor} implementation that scrapes question pages and
 * question-list pages by CSS selector.
 * <p>
 * Single-question extraction is delegated to the injected
 * {@link HTMLExtractor}; list pages are fetched and parsed directly with
 * jsoup. The URLs harvested by {@link #extractQuestionURLList(String)} are
 * cached in a local text file that {@link #getQuestionURLList(String)} reads
 * back.
 */
@Service("wenwenExtractor")
public class WenwenExtractorImpl implements WenwenExtractor {

	/** Local file used to cache the harvested question URLs, one per line. */
	private static final String QUESTION_URL_FILE = "E:/ming616/data/question.txt";

	/** Timeout, in milliseconds, for fetching one list page. */
	private static final int FETCH_TIMEOUT_MILLIS = 6000;

	@Autowired
	HTMLExtractor htmlExtractor;

	/**
	 * Extracts a single question from the page at {@code url}.
	 *
	 * @param url
	 *            address of the question page
	 * @return a {@link Question} populated with the title, content, official
	 *         answer and star answer found by the selectors below; a field is
	 *         whatever the extractor returned for its selector
	 */
	public Question getQuestion(String url) {
		Map<String, String> selectors = new HashMap<String, String>();
		selectors.put("title", "div.question_main H3");
		selectors.put("content", "div.question_con");
		selectors.put("officialAnswer", "div.sloved_answer_main .answer_con");
		selectors.put("starAnswer", "div.sloved_answer_main2 .answer_con");
		Map<String, String> contentMap = this.htmlExtractor.getContent(url,
				selectors);
		Question q = new Question();
		q.setTitle(contentMap.get("title"));
		q.setContent(contentMap.get("content"));
		q.setOfficeAnswer(contentMap.get("officialAnswer"));
		q.setStarAnswer(contentMap.get("starAnswer"));
		return q;
	}

	/**
	 * Reads the number of list pages from the pager on the page at
	 * {@code url}.
	 *
	 * @param url
	 *            address of a question-list page
	 * @return the number parsed from the element matched by
	 *         {@code ".extend + li"}, or {@code 0} when that element is
	 *         missing or its text is not a valid integer
	 */
	private int getPageNum(String url) {
		Map<String, String> selectors = new HashMap<String, String>();
		selectors.put("num", ".extend + li");
		Map<String, String> resultMap = this.htmlExtractor.getContent(url,
				selectors);
		String raw = resultMap.get("num");
		if (raw == null) {
			return 0;
		}
		try {
			return Integer.parseInt(raw.trim());
		} catch (NumberFormatException e) {
			// Empty or non-numeric pager text: report it and treat the
			// listing as having no pages instead of aborting the caller.
			e.printStackTrace();
			return 0;
		}
	}

	/**
	 * Returns the question URLs previously cached by
	 * {@link #extractQuestionURLList(String)}.
	 *
	 * @param url
	 *            not used by this implementation; the list is always read
	 *            from the local cache file
	 * @return the lines of the cache file, or an empty list when the file
	 *         cannot be read; never {@code null}
	 */
	public List<String> getQuestionURLList(String url) {
		try {
			return FileUtils.readLines(new File(QUESTION_URL_FILE));
		} catch (IOException e) {
			e.printStackTrace();
			// An empty list lets callers iterate without a null check.
			return new ArrayList<String>();
		}
	}

	/**
	 * Walks every list page reachable from {@code url}, collects the question
	 * links on each, and writes the result to the local cache file.
	 * <p>
	 * Pages are requested as {@code url + "&pg=" + i} for {@code i} from 0 up
	 * to (but excluding) the page count. A page that cannot be fetched is
	 * reported and skipped; a failure to write the cache file is reported but
	 * does not affect the returned list.
	 *
	 * @param url
	 *            address of the first question-list page
	 * @return the absolute URLs of all question links found, in page order
	 */
	public List<String> extractQuestionURLList(String url) {
		int pageNum = this.getPageNum(url);
		List<String> urlList = new ArrayList<String>();
		for (int i = 0; i < pageNum; i++) {
			String pageURL = url + "&pg=" + i;
			try {
				Document doc = Jsoup.parse(new URL(pageURL),
						FETCH_TIMEOUT_MILLIS);
				Elements links = doc
						.select(".questions_wrap a[target=_blank]");
				for (Element link : links) {
					// Resolve against the page's base URI rather than gluing
					// protocol + host + href: that keeps a non-default port
					// and handles absolute and path-relative hrefs correctly.
					String href = link.absUrl("href");
					if (href.length() > 0) {
						urlList.add(href);
					}
				}
			} catch (MalformedURLException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		try {
			FileUtils.writeLines(new File(QUESTION_URL_FILE), urlList);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return urlList;
	}

}
